#' Derive a town name (and town coordinates) for every school in `data`
#'
#' Strips school-type vocabulary, class ranges, numbers and quoted patron
#' names from `school_name`, stores what is left in `school_name_town`, and
#' fuzzy-matches that remainder against the list of localities in
#' `orase.csv` to obtain coordinates.
#'
#' Side effects / external inputs:
#'   * reads the global `wd_data_raw_other` and calls `setwd()` on it;
#'   * reads `county_capitals.csv` (columns used: Judet, Lng, Lat) and
#'     `orase.csv` (columns used: Loc, Judet, Lng, Lat, Pop) from there.
#'
#' @param data Data frame with at least the columns `judet`, `school_name`,
#'   `school_name_long`, `school_name_modified`, `address`, `match_type`.
#'   Rows whose `judet` has no entry in `county_capitals.csv` are dropped by
#'   the inner merge.
#' @return A data frame with the columns `judet`, `school_name`,
#'   `school_name_long`, `school_name_modified` (noise removed),
#'   `school_name_town` (suffixed with ", Romania"), `address`, `lat`, `lng`,
#'   `match_type`. `lat`/`lng` are NA for schools with no candidate locality.
clean_get_addresses__get_town_from_list <- function(data) {

  # ---- Patterns -------------------------------------------------------------

  # Substrings removed wherever they occur (first pass, default regex engine).
  # NOTE(review): the unescaped ")" after "DE COOPERATIE" and at the very end
  # of the pattern look like typos; they are kept byte-for-byte because the
  # default engine appears to accept them as literals -- confirm.
  noise_pattern <- paste0(
    "ORASUL|LOCALITATEA|",
    "\\(ENERGETIC\\)|\\(AUTO\\)|\\(EC\\. TEODOROIU\\)|ECONOMIC\\, ADMINISTRATIV SI DE SERVICII|",
    "LOC\\.|MUN\\.| SAT$|COMUNA|ORAS|MUN |COM\\.|MUNICIPIUL|",
    "TEHNOLOGIC DE TURISM SI ALIMENTATIE|MARMATIA|AL BANATULUI MONTAN|",
    "PARTICULAR|DE ARTA|DEFICIENTI DE VEDERE|BILINGV ROMINO-CROAT|",
    "DE CONSTRUCTII SI PROTECTIA MEDIULUI|PROFESIONALA DE COOPERATIE|",
    "DE PROTECTIA MEDIULUI|WALDORF|DE VEST|",
    "PENTRU EDUCATIE INCLUZIVA|DE ARTE SI MESERII|",
    "CONSTR. CAI FERATE|DE CHIMIE INDUSTRIALA|",
    "CONSTRUCTII MONTAJ|INDUSTRIAL MINIER|MINIER|",
    "DE COOPERATIE|INDUSTRIE USOARA|ECONOMIC-ADMINISTRATIV|",
    "MESERII SI SERVICII|INDUSTRIE MICA SI SERVICII|NATIONALA DE GAZ|",
    "DE MUZICA SI ARTE PLASTICE|CU PROGRAM DE ATLETISM|DE MUZICA|",
    "ADMINISTRATIV SI DE SERVICII|ALIMENTATIE PUBLICA|CU PROGRAM SPORTIV|",
    "FORESTIER|INDUSTRIA STICLEI|COOPERATIST|COOPERATIE|",
    "DE MARINA|DE AGRICULTURA SI ECONOMIE|CONSTRUCTII CAI FERATE|",
    "DE TRANSPORTURI AUTO|BILINGV ROMANO-CROAT|TEHNIC DE TRANSPORTURI|",
    "SPECIAL PENTRU DEFICIENTI DE AUZ|ADVENTIST|",
    "TRANSPORTURI CAI FERATE|VOCATIONAL DE ARTA|ROMANO-CATOLIC|SPECIAL|AUTO|",
    "SILVIC|TEHNIC|CONSTRUCTII DE MASINI|METALURGIC|PETROL|",
    "BANATEAN|CARASAN|ALIMENTARA|PENTICOSTAL|BAPTIST|AGRICOL|DE AFACERI|",
    "REFORMAT|TEOLOGIC|ORTODOX|TEHNOLOGIC FORESTIER|TEHNOLOGIC CONSTRUCTII DE MASINI|",
    "SPORTIV|SANITAR|ENERGETIC|TEHNOLOGIC|PEDAGOGIC|TEORETIC|",
    "DE INFORMATICA|ECONOMIC|TRANSPORTURI|ADMINISTRATIV|DE TURISM|PENTRU TRANSPORT RUTIER|",
    "MAGHIARA|TELECOMUNICATII|DE ARTE|CONSTRUCTII|SERVICII|AUTOMECANICA|NATIONAL|",
    "FEROVIAR|TEXTIL|GR. SC.|CENTRUL|SCOLAR|PARTICULARA|SANATORIALA|",
    "THE CAMBRIDGE INTERNATIONAL SCHOOL IN|GRUPUL|GRUP|LICEUL|",
    "SCOALA|GIMNAZIALA|SCOALA CU CLASELE I-VIII\\.|SCOALA CU CLASELE I-VIII|SCOALA CLASELE I-VIII|CLASELE 1-8|CLASE 1-8|",
    "CLASE I-VIII|CU CLS\\. 1-8|GEN\\.|S08|SO8|GENERALA DE 8 ANI|GENERALA|SCOALACU|GIMNAZIUL|SCOALAI-VIII|",
    "CLS\\.|1-VIII|1-8|I-VIII\\.|I-VIII|N[0-9]|8 CLASE|COLEGIUL|CU CLASELE|CORNERSTONE-FILIALA ASOCIATIEI HOPE FOR THE CHILDREN INTER|",
    "I-XII|CORNERSTONE-FILIALA ASOCIATIEI HOPE FOR THE CHILDREN INTER|CU CLASE|CU CLASELE|CU CLS\\.|CLASELE|CLASE |CU CL\\.|",
    "I-X|CENTRUL DE REEDUCARE MINORI|\\(|\\)|STRUCTURA| SI GRUPE DE PRESCOLARI|CU PREDARE IN LIMBA|",
    "MAGHIARA|MAGH\\.|DE COREGRAFIE SI ARTA DRAMATICA|SPECIALA|CENTRU DE RESURSE SI DOCUMENTARE PRIVIND EDUCATIA INCLUSIVA/INTEGRATA|",
    "FUNDATIA CULTURAL-UMANITARA|PRIVAT|PRIVATA|SC\\. GIM\\.|CU PROGRAM DE ED\\. FIZICA SI SPORT|SPORT|",
    "EDUCATIE FIZICA|PENTRU DEFICIENTI DE AUZ|ROMANO-AMERICANA|CHIMIE|INDUSTRIAL|CU PREDARE IN LIMBA| CU PEDAGOGIE MONTESSORI|",
    "CLUBUL COPIILOR SI ELEVILOR|CENTRUL DE REEDUCARE MINORI|COMPLEXUL EDUCATIONAL|GIMANZIALA|",
    "DE ARET SI MESERII|DE ATE SI MESERII|AGROINDUSTRIAL|V-VIII|DE ARTE PLASTICE|TEOR\\.|",
    "DE STAT|CU PROGRAM|CU PROGR\\.|GRI |PENTRU|DEFICIENTI|DEFICIENTE|VEDERE| DE VEDERE|",
    "PREVENTORIU|TBC|I-IV|DE ATLETISM|ATELTISM|SPITAL|DE EDUCATIE|INCLUZIVA|INCLUZIV|",
    "ARTE|VOCATIONAL|VOCATIONALA|BILINGV|BILINGVA|DE INDUSTRIE|INDUSTRIE|",
    "USOARA|GREA |INCLUZIA|PENTRU|EDUCATIE|LICEU|DE INDUSTRIAL|SANATORIAL|CONSTR|DE MASINI|-SAT|",
    "CU CL|SCOLAR|MONAHAL|DE COOPERATIE)|MILITAR|AGROMONTAN|DE ECOLOGIE SI PROTECTIA MEDIULUI|NAVAL|",
    "CENTRU DE STUDII|DE CHIMIE|COM\\.|DE ECOLOGIE SI PROT\\. MED\\. )"
  )

  # Short tokens removed only when they stand alone, i.e. delimited by the
  # string start/end, whitespace or punctuation (second pass, PCRE).
  # Generating every alternative from one template keeps them consistent:
  # the hand-written "SAT" alternative used to lack the "|" between its two
  # lookaheads, so it could only ever match at the end of the string.
  standalone_tokens <- c("SAT", "ARTA", "COM", "ORAS", "LOC", "AUTO", "FEG")
  standalone_pattern <- paste0(
    "(^|(?<=\\s)|(?<=[[:punct:]]))",
    standalone_tokens,
    "((?=\\s)|(?=[[:punct:]])|$)",
    collapse = "|"
  )

  # Input quirks noted by the original author:
  #   S08/SO8           -> means SCOALA CLASELE I-VIII
  #   SCOALACU          -> means SCOALA CU
  #   SCOALAI-VIII
  #   CU PREDARE IN LIMBA MAGH.

  # ---- Reference data -------------------------------------------------------

  # NOTE(review): this changes the process working directory and does not
  # restore it; left unchanged because callers may rely on that.
  setwd(wd_data_raw_other)

  # County capitals: used to measure how far a candidate town is from the
  # capital of the school's county.
  county_capitals <- read.csv("county_capitals.csv", sep = ",",
                              encoding = "UTF-8", stringsAsFactors = FALSE,
                              quote = "")
  county_capitals$Judet <- toupper(county_capitals$Judet)
  county_capitals <- county_capitals %>%
    mutate(Judet = ifelse(Judet == "SATU MARE", "SATU-MARE", Judet)) %>%
    select(Judet, Lng, Lat) %>%
    rename(Lng_judet = Lng, Lat_judet = Lat)
  data <- base::merge(data, county_capitals, by.x = "judet", by.y = "Judet")

  # Localities with their GPS coordinates.
  towns <- read.csv("orase.csv", sep = ",", encoding = "UTF-8",
                    stringsAsFactors = FALSE, quote = "")
  towns$Loc <- toupper(towns$Loc)
  towns$Judet <- toupper(towns$Judet)
  towns <- towns %>% arrange(Judet, Loc)

  # ---- Extract the town name from the school name ---------------------------

  # Drop anything in double quotes, then the noise vocabulary.
  town <- gsub('"(.*?)"', "", data$school_name, perl = TRUE)
  town <- gsub(noise_pattern, " ", town, perl = FALSE)
  town <- gsub(standalone_pattern, " ", str_trim(town), perl = TRUE)
  town <- trimws(town)
  # Removing noise can bring two stray quotes together; strip such pairs too.
  town <- gsub('"(.*?)"', "", town, perl = TRUE)

  # Remove "number" markers. With PCRE the first matching alternative wins,
  # so "NR [0-9]" and "NR\\. [0-9]" are never reached; the order is kept
  # as-is because changing it would change the result.
  number_string <- "NR\\.|NUMARUL|N[0-9]|NR|NR [0-9]|NR\\. [0-9]"
  town <- gsub(number_string, "", town, perl = TRUE)

  # Strip remaining digits, except where the digits belong to the town name.
  towns_with_numbers <- c("23 AUGUST", "2 MAI", "MILA 23")
  has_numbered_town <- grepl(paste(towns_with_numbers, collapse = "|"), town)
  town[!has_numbered_town] <- gsub("[0-9]", "", town[!has_numbered_town])

  # Special case handled by hand.
  town <- ifelse(town == "23 AUGUST-FLORENI", "FLORENI", town)

  # Normalise commas and whitespace.
  town <- trimws(town)
  town <- gsub(",", "", town)
  town <- gsub("\\s+", " ", trimws(town))

  # Collapse repeated words within a name. vapply() (instead of indexing
  # 1:length(x)) also works when `data` has no rows.
  town <- vapply(
    strsplit(town, split = " "),
    function(words) paste(unique(words), collapse = " "),
    character(1)
  )
  data$school_name_town <- town

  # ---- Match against the list of localities ---------------------------------

  # Fuzzy left join on the town name; `dist` is the string distance and is NA
  # for schools without any candidate.
  data <- stringdist_left_join(data, towns,
                               by = c("school_name_town" = "Loc"),
                               distance_col = "dist")
  # Euclidean distance (in degrees) between candidate town and county capital.
  data$geo_distance <- sqrt((data$Lng_judet - data$Lng)^2 +
                              (data$Lat_judet - data$Lat)^2)

  # 1. Exact name match inside the school's own county; when several
  #    localities qualify, keep the one with the smallest population.
  perfect_match <- data %>%
    filter(dist == 0 & judet == Judet) %>%
    group_by(judet, school_name) %>%
    slice(which.min(Pop)) %>%
    ungroup()

  # 2. For the remaining schools take the best candidate (smallest string
  #    distance, then closest to the county capital, then largest population)
  #    and keep it only if it lies in the school's own county.
  unmatched <- anti_join(data, perfect_match, by = c("judet", "school_name"))
  best_match_in_county <- unmatched %>%
    filter(!is.na(dist)) %>%
    group_by(judet, school_name) %>%
    arrange(judet, school_name, dist, geo_distance, -Pop) %>%
    slice(which.min(dist)) %>%
    filter(judet == Judet) %>%
    ungroup()

  # 3. Same ranking for what is still unmatched; here the best candidate lies
  #    in another county.
  unmatched <- anti_join(unmatched, best_match_in_county,
                         by = c("judet", "school_name"))
  best_match_outside_county <- unmatched %>%
    filter(!is.na(dist)) %>%
    group_by(judet, school_name) %>%
    arrange(judet, school_name, dist, geo_distance, -Pop) %>%
    slice(1) %>%
    filter(judet != Judet) %>%
    ungroup()

  # 4. Whatever is left keeps its rows, with no coordinates.
  unmatched <- anti_join(unmatched, best_match_outside_county,
                         by = c("judet", "school_name"))

  data_refined <- rbind(perfect_match, best_match_in_county,
                        best_match_outside_county, unmatched)
  data_refined$lat <- data_refined$Lat
  data_refined$lng <- data_refined$Lng

  data_refined <- data_refined %>%
    select(judet, school_name, school_name_long, school_name_modified,
           school_name_town, address, lat, lng, match_type)
  data_refined$school_name_town <- paste0(data_refined$school_name_town,
                                          ", Romania")

  # ---- Clean the modified school name ---------------------------------------

  # Remove "noisy" components (school level, class ranges) from the modified
  # name to make a later GPS lookup easier.
  modified_name_noise <- paste0(
    "GIMNAZIALA|CU CLASELE I-VIII\\.|CU CLASELE I-VIII|CLASELE I-VIII|CLASELE 1-8|CLASE 1-8|",
    "CLASE I-VIII|CU CLS\\. 1-8|GEN\\.|S08|SO8|GENERALA DE 8 ANI|GENERALA|GIMNAZIUL|",
    "CLS\\.|1-VIII|1-8|I-VIII\\.|I-VIII|N[0-9]|8 CLASE|CU CLASELE|",
    "I-XII|CU CLASE|CU CLASELE|CU CLS\\.|CLASELE|CLASE |CU CL\\.|",
    "I-X|GIM\\.|",
    "V-VIII|DE ARTE PLASTICE|TEOR\\.|",
    "-SAT|CU CL"
  )
  data_refined$school_name_modified <- gsub(modified_name_noise, "",
                                            data_refined$school_name_modified)
  data_refined$school_name_modified <- gsub(
    "\\s+", " ", trimws(data_refined$school_name_modified)
  )

  data_refined
}
